import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import regex as re
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import random
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import TimeoutException, NoSuchElementException

# Build the Chrome session shared by every scraping step below.
# Both flags are passed straight through to Chrome as command-line switches.
chrome_options = webdriver.ChromeOptions()
for _chrome_flag in ("--no-sandbox", "--disable-dev-shm-usage"):
    chrome_options.add_argument(_chrome_flag)

# webdriver_manager resolves the chromedriver binary; its path feeds the Service.
_chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=_chrome_service, options=chrome_options)

### First scraping: job listings

# Tunables shared by all three scraping steps.
PAGE_LOAD_DELAY_SECONDS = 5   # fixed pause after each navigation so dynamic content can render
ELEMENT_WAIT_SECONDS = 30     # upper bound for explicit WebDriverWait conditions
RESULTS_PER_PAGE = 10         # page size requested from the search endpoint
MAX_RESULT_OFFSET = 500       # exclusive upper bound for the `from=` offset

POSTING_DATE_PATTERN = re.compile(r'\d{1,2}/\d{1,2}/\d{4}')

all_links = []       # every job URL seen, across all pages
locs = []            # every location string seen, across all pages
titles = []          # every job title seen, across all pages
df = pd.DataFrame()  # validated rows only (one row per job)
err = []             # page offsets that failed or looked inconsistent; revisit these

# The window size does not change between pages, so set it once.
driver.maximize_window()

# Scraping loop for job listings
for i in range(0, MAX_RESULT_OFFSET, RESULTS_PER_PAGE):  # `from=` offsets
    print(f"Processing page {i}")

    # Site of job
    url = f"https://url.com/search?keywords=specialist&from={i}&results={RESULTS_PER_PAGE}"

    try:
        driver.get(url)
        time.sleep(PAGE_LOAD_DELAY_SECONDS)

        # Wait until at least a handful of job cards have rendered.
        WebDriverWait(driver, ELEMENT_WAIT_SECONDS).until(
            lambda d: len(d.find_elements(By.CLASS_NAME, "job-card")) >= 5
        )

        # Job URLs and titles come from the same anchor elements.
        job_elements = driver.find_elements(By.XPATH, "//a[contains(@href, '/career/')]")
        links = [elem.get_attribute("href") for elem in job_elements]
        page_titles = [elem.text for elem in job_elements]

        # Location information on this page. When a site does not expose it
        # here, it is recovered later through the store locator (third step).
        location_elements = driver.find_elements(
            By.XPATH, "//span[contains(@class, 'location-text')]"
        )
        loc = [elem.text.strip() for elem in location_elements]

        # Posting dates; elements without a recognisable date are skipped,
        # which makes the consistency check below fail for that page.
        pdate = []
        for elem in driver.find_elements(By.CLASS_NAME, "posting-date"):
            date_match = POSTING_DATE_PATTERN.search(elem.text)
            if date_match:
                pdate.append(date_match.group(0))
    except Exception as exc:
        # Best-effort scraping: record the page for a later retry and move on.
        print(f"Page {i} failed: {exc}")
        err.append(i)
        continue

    all_links.extend(links)
    titles.extend(page_titles)
    locs.extend(loc)

    # Only keep pages where every field was found for every listing;
    # otherwise the columns cannot be aligned row by row.
    if len(loc) == len(links) == len(pdate) == len(page_titles) == RESULTS_PER_PAGE:
        new_df = pd.DataFrame({
            "location": loc,
            "title": page_titles,
            "url": links,
            "page_index": i,
            "post_date": pdate,
        })
        df = pd.concat([df, new_df], ignore_index=True)
    else:
        # Previously these pages were dropped silently; mark them to revisit.
        err.append(i)


### Second scraping: check (entry-level) qualification requirements
# Keywords are manually modified on a company-basis.

qualified_titles = []

qualified_jobs_df = pd.DataFrame()

# Section headers that open / close the requirements part of a description.
REQUIREMENT_HEADERS = ("requirements:", "required:", "qualifications:")
REQUIREMENT_END_HEADERS = ("desired:", "preferred:", "responsibilities:", "duties:")

# All matching is done against lower-cased text, so keep these lower-case.
FULL_TIME_PHRASES = ("full-time", "full time", "40 hours")
# "ft" is matched as a whole word so it does not fire on e.g. "software".
FULL_TIME_ABBREVIATION = re.compile(r"\bft\b")
COLLEGE_TERMS = (
    "bachelor", "degree", "college", "university", "associate",
    "certification", "certificate",
)
EXPERIENCE_DISQUALIFIERS = (
    "years of experience", "experience required", "previous experience",
)
LICENSE_DISQUALIFIERS = (
    "professional license", "certification required", "licensed", "certified",
)
DRIVER_LICENSE_TERMS = (
    "driver's license", "drivers license", "valid driver", "driving license",
)


def check_entry_level_qualification(url, title):
    """Decide whether the job at *url* looks like a full-time entry-level job.

    Titles already listed in ``qualified_titles`` are accepted without
    loading the page. Otherwise the job page is opened with the shared
    ``driver`` and the text between a requirements header and the next
    "preferred/responsibilities"-style header is checked against the
    keyword lists defined above.

    Args:
        url: Job posting URL to load.
        title: Job title, used for the shortcut and copied into the result.

    Returns:
        A one-row DataFrame (``title``, ``url`` and, when the page was
        analysed, ``requirements``) if the job qualifies; ``False`` if no
        requirements section was found; ``None`` if the job does not
        qualify or the page could not be loaded.
    """
    if title in qualified_titles:
        return pd.DataFrame({"title": [title], "url": [url]})

    try:
        driver.get(url)
        time.sleep(PAGE_LOAD_DELAY_SECONDS)

        # Wait for job description to load
        WebDriverWait(driver, ELEMENT_WAIT_SECONDS).until(
            EC.presence_of_element_located((By.CLASS_NAME, "job-details-content"))
        )
        job_description = driver.find_element(By.CLASS_NAME, "job-details-content").text
    except Exception as exc:
        # Best-effort, like the other scraping steps: report and skip the job.
        print(f"Could not load job description for {url}: {exc}")
        return None

    text_lines = job_description.split("\n")
    lowered_lines = [line.lower() for line in text_lines]

    # First line that contains a requirements header.
    req_index = next(
        (idx for idx, line in enumerate(lowered_lines)
         if any(keyword in line for keyword in REQUIREMENT_HEADERS)),
        None,
    )
    if req_index is None:
        return False

    # Requirements run until the next section header, or to the end of text.
    end_index = next(
        (idx for idx, line in enumerate(lowered_lines[req_index + 1:], req_index + 1)
         if any(keyword in line for keyword in REQUIREMENT_END_HEADERS)),
        len(text_lines),
    )
    requirements = text_lines[req_index + 1:end_index]
    requirements_text = " ".join(requirements).lower()

    # Full-time status is searched in the whole description, not only in the
    # requirements; sometimes the search filters already guarantee it.
    description_lower = job_description.lower()
    is_full_time = (
        any(phrase in description_lower for phrase in FULL_TIME_PHRASES)
        or FULL_TIME_ABBREVIATION.search(description_lower) is not None
    )

    # A high-school diploma (or no stated education) is acceptable, so only
    # college-level terms disqualify.
    education_valid = not any(term in requirements_text for term in COLLEGE_TERMS)

    has_experience_req = any(
        phrase in requirements_text for phrase in EXPERIENCE_DISQUALIFIERS
    )

    # Licence/certificate wording disqualifies unless the requirements also
    # mention a driver's licence, in which case the wording is attributed to it.
    mentions_driver_license = any(
        term in requirements_text for term in DRIVER_LICENSE_TERMS
    )
    has_special_license = (
        any(phrase in requirements_text for phrase in LICENSE_DISQUALIFIERS)
        and not mentions_driver_license
    )

    is_qualified = (
        is_full_time
        and education_valid
        and not has_experience_req
        and not has_special_license
    )
    if not is_qualified:
        return None

    return pd.DataFrame({
        "title": [title],
        "url": [url],
        "requirements": ["; ".join(requirements)],
    })


# Append qualified rows outside of the function
# qualified_jobs_df = pd.concat([qualified_jobs_df, new_row])


### Third scraping: job locations (using store locator)
address = []
used = []
page_err = err  # keep the page failures from the first step reachable
err = []        # store IDs whose locator page could not be scraped

# Get store identifiers from previous scraping. `df` has no columns at all
# when no listing page passed validation, so guard the column access.
if "location" in df.columns:
    store_ids = df['location'].str.extract(r'Store (\d{4,5})')[0].dropna().tolist()
else:
    store_ids = []
# Many jobs share a store; visit each store page once, in first-seen order.
location_terms = list(dict.fromkeys(store_ids))

for term in location_terms:
    print(f"Processing location: {term}")

    # Construct location URL
    location_url = f"https://url.com/store/{term}"

    try:
        driver.get(location_url)
        time.sleep(PAGE_LOAD_DELAY_SECONDS)

        # A differing URL means we were redirected away from the store page.
        if driver.current_url != location_url:
            err.append(term)
            continue
        print("URL verified")

        # Wait for location content
        WebDriverWait(driver, ELEMENT_WAIT_SECONDS).until(
            EC.presence_of_element_located((By.CLASS_NAME, "store-details"))
        )

        # Extract address information
        address_elements = driver.find_elements(By.CLASS_NAME, "address-line")
        full_address = " ".join(elem.text.strip() for elem in address_elements)
    except Exception as exc:
        # Covers timeouts, missing elements and any other driver failure.
        print(f"Store {term} failed: {exc}")
        err.append(term)
        continue

    # Append both lists together so they always stay the same length.
    address.append(full_address)
    used.append(term)

location_df = pd.DataFrame({"store_id": used, "full_address": address})
